import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import neighbors
from sklearn import preprocessing
from sklearn import tree
from sklearn import metrics
from sklearn import model_selection
from sklearn import linear_model
from sklearn import feature_selection
from sklearn import tree
from sklearn import ensemble
from sklearn import preprocessing
from sklearn import neighbors
from sklearn import naive_bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix,classification_report
from tempfile import TemporaryFile
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import roc_auc_score
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.preprocessing import label_binarize
from sklearn import svm
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
# Load the handset-segmentation training data.
# NOTE(review): absolute Windows-specific path — consider making this configurable.
df=pd.read_csv("C:/Users/Apratim/Desktop/C/ml/datasets/handset_segmentation/train.csv")
def brief_info():
    """Print a quick overview of the dataset and return summary statistics.

    Prints df.info(), shape, head, tail, then lists which columns are
    discrete (dtype int64) and which are continuous (any other dtype).

    Returns:
        pandas.DataFrame: the output of ``df.describe()``.
    """
    print(df.info())
    print(df.shape)
    print(df.head())
    print(df.tail())
    print('Discrete columns are:')
    # select_dtypes replaces the original manual dtype-comparison loop
    for cols in df.select_dtypes(include=['int64']).columns:
        print(cols, '|', end=' ')
    print('\n__________________________')
    print('Continous Columns are:')
    for cols in df.select_dtypes(exclude=['int64']).columns:
        print(cols)
    return df.describe()
brief_info()
l4 = []
def seperator():
    """Find the binary-valued ("boolean") columns of df.

    A column counts as boolean when it holds exactly two distinct values.
    Appends those column names to the module-level list ``l4``, prints the
    boolean and remaining column names, and prints value counts for each
    boolean column.
    """
    all_columns = df.columns
    for name in all_columns:
        # exactly two distinct values -> treat as boolean
        if len(np.unique(np.array(df[name]))) == 2:
            l4.append(name)
    remaining = list(set(all_columns) - set(l4))
    print("Boolean valued columns are:", l4)
    print("______________________")
    print("Rest of the columns are:", remaining)
    print("______________________")
    for name in l4:
        print(df[name].value_counts())
seperator()
# Feature matrix (every column except the target) and the target vector.
X=df.drop(columns=["price_range"])
y=df.price_range
# Pairwise scatter plots of all features, coloured by price_range.
sns.pairplot(data=df,hue='price_range')
def plot():
    """Show one count plot per boolean column (from l4), split by price_range."""
    for column in l4:
        sns.countplot(x=column, hue='price_range', data=df)
        plt.show()
plot()
def showplot():
    """KDE distributions of selected continuous features, one FacetGrid each,
    coloured by price_range."""
    selected = ['ram', 'int_memory', 'px_height', 'talk_time']
    for feature_name in selected:
        grid = sns.FacetGrid(data=df, hue="price_range")
        grid.map(sns.kdeplot, feature_name)
        grid.add_legend()
        plt.show()
showplot()
# Joint KDE plots: relationship of ram / int_memory / clock_speed with price_range.
sns.jointplot(x='ram',y='price_range',data=df,color='red',kind='kde')
sns.jointplot(x='int_memory',y='price_range',data=df,color='blue',kind='kde')
# Point plots: per-class mean (with CI) of selected features.
sns.pointplot(y='px_height',x='price_range',data=df)
sns.pointplot(y='clock_speed',x='price_range',data=df)
sns.jointplot(x='price_range',y='clock_speed',data=df,color='green',kind='kde')
sns.pointplot(y='talk_time',x='price_range',data=df)
# Battery power spread per price class.
sns.boxplot(x="price_range",y="battery_power",data=df)
plt.show()
# Pie chart of 4G support. value_counts() orders its result by frequency,
# so derive each slice's label from the counts' index instead of assuming
# a fixed order — the original hard-coded label list could mislabel the
# slices whenever "no 4G" happened to be the more frequent value.
counts4g = df["four_g"].value_counts()
labels4g = ["4g_supported" if flag == 1 else "Not_supported" for flag in counts4g.index]
values4g = counts4g.values
fig1, ax1 = plt.subplots()
ax1.pie(values4g, labels=labels4g, shadow=True, startangle=90)
plt.show()
# Overlaid histograms: front vs. primary camera megapixels.
plt.figure(figsize=(10,6))
df["fc"].hist(alpha=0.5,color="blue",label="Front Camera")
df["pc"].hist(alpha=0.5,color="red",label="Primary Camera")
plt.legend()
plt.xlabel("MegaPixels")
# Overlaid histograms: pixel height vs. width.
plt.figure(figsize=(10,6))
df["px_height"].hist(alpha=0.5,color="yellow",label="Pixel Height")
df["px_width"].hist(alpha=0.5,color="pink",label="Pixel Width")
plt.legend()
plt.xlabel("Pixels")
# Overlaid histograms: screen height vs. width.
plt.figure(figsize=(10,6))
df["sc_h"].hist(alpha=0.5,color="green",label="Screen Height")
df["sc_w"].hist(alpha=0.5,color="red",label="Screen Width")
plt.legend()
plt.xlabel("Screen Size")
# Standardise all features (zero mean, unit variance), then rebuild a
# DataFrame carrying the original column names.
X1=preprocessing.scale(X,with_mean=True,with_std=True,copy=True)
df1=pd.DataFrame(X1)
df1.columns=X.columns
X2=df1
def mutual(X, y):
    """Bar-plot the mutual information between each feature in X and target y.

    Feature names are taken from the module-level df1 (same columns as X2).
    """
    scores = feature_selection.mutual_info_classif(X, y)
    score_series = pd.Series(scores, index=df1.columns.values)
    score_series.sort_values(ascending=False).plot.bar()
mutual(X2, y)
list1 = []
def feature(n):
    """Rank features by two selectors and record the columns to drop.

    Runs SelectKBest (default f_classif score) and ExtraTreesClassifier
    feature importances, plots each top-n ranking, takes the intersection
    of the two top-n sets, and appends every df column NOT in that
    intersection to the module-level list ``list1``.
    """
    # --- univariate scoring ---
    selector = SelectKBest(k=n)
    fitted = selector.fit(X2, y)
    score_col = pd.DataFrame(fitted.scores_)
    name_col = pd.DataFrame(df1.columns)
    # concat names and scores side by side for readability
    score_table = pd.concat([name_col, score_col], axis=1)
    score_table.columns = ['Specs', 'Score']
    top_univariate = score_table.nlargest(n, 'Score')
    top_univariate.plot(kind='bar')
    plt.show()
    univariate_set = set(top_univariate["Specs"])
    print("==============================================")
    # --- tree-based importance ---
    forest = ExtraTreesClassifier()
    forest.fit(X2, y)
    importances = pd.Series(forest.feature_importances_, index=df1.columns)
    top_tree = importances.nlargest(n)
    tree_set = set(top_tree.index)
    top_tree.plot(kind='bar')
    plt.show()
    # columns both selectors agree on are kept; everything else is dropped later
    agreed = univariate_set.intersection(tree_set)
    for item in set(df.columns).difference(agreed):
        list1.append(item)
feature(5)
# The target ended up in the drop list (it is never selected as a feature);
# keep it out, then drop the weak features from the scaled frame.
list1.remove("price_range")
X2=df1.drop(columns=list1)
##Train,test split
Xtrain,Xtest,ytrain,ytest=model_selection.train_test_split(X2,y,test_size=.40,random_state=42)
# Candidate classifiers to be compared later in fit().
model=linear_model.LogisticRegression()
treemodel=tree.DecisionTreeClassifier(max_depth=2)
gnbobj=naive_bayes.GaussianNB()
rfmodel=ensemble.RandomForestClassifier()
b=[]
def best_n(n):
    """Grid-search the random forest's n_estimators over [200, n] with
    5-fold CV and append the winning value to the module-level list ``b``."""
    search = GridSearchCV(estimator=rfmodel,
                          param_grid={'n_estimators': [200, n]},
                          cv=5)
    search.fit(Xtrain, ytrain)
    b.append(search.best_params_['n_estimators'])
best_n(500)
# Rebuild the forest with the tuned estimator count.
rfmodel=ensemble.RandomForestClassifier(n_estimators=b[0])
a=[]
def best_k(n):
    """Grid-search the KNN neighbour count over odd values 5, 7, ..., < n
    and append the best k to the module-level list ``a``.

    Fixes vs. original: the candidate list was passed as ``n_neighbors``
    to the KNeighborsClassifier constructor (it expects a single int —
    GridSearchCV supplies the candidates via param_grid), and the already
    refitted ``best_estimator_`` was redundantly refit a second time.
    """
    candidates = [k for k in range(5, n, 2)]
    grid_obj = GridSearchCV(estimator=neighbors.KNeighborsClassifier(),
                            param_grid={"n_neighbors": candidates})
    grid_fit = grid_obj.fit(Xtrain, ytrain)
    a.append(grid_fit.best_params_['n_neighbors'])
best_k(17)
# Final KNN model built with the tuned neighbour count.
knnobj=neighbors.KNeighborsClassifier(n_neighbors=a[0])
def cross_entropy(y, p):
    """Mean negative log-likelihood of integer labels under predictions.

    Args:
        y: integer class labels, shape (n_samples,).
        p: predicted class probabilities, shape (n_samples, n_classes).

    Returns:
        float: average of -log(p[i, y[i]]) over all samples.
    """
    n_samples = y.shape[0]
    picked = p[np.arange(n_samples), y]
    return float(np.sum(-np.log(picked)) / n_samples)
def multiclass_roc_auc_score(ytest, testp, average="macro"):
    """One-vs-rest ROC AUC for multiclass labels.

    Binarizes both the true labels and the predictions with the same
    LabelBinarizer and scores them with the requested averaging.
    NOTE(review): scoring hard predictions (not probabilities) — AUC on
    binarized class labels; confirm this is intended.
    """
    binarizer = LabelBinarizer().fit(ytest)
    true_bin = binarizer.transform(ytest)
    pred_bin = binarizer.transform(testp)
    return roc_auc_score(true_bin, pred_bin, average=average)
# Accumulators filled by model_built: losses, AUC scores, accuracies.
li=[]
li1=[]
li2=[]
def model_built(m):
    """Fit classifier m on the training split and report test-set metrics.

    Prints accuracy, classification report, confusion matrix (text and
    heatmap), cross-entropy loss and multiclass ROC AUC; appends the loss,
    AUC and accuracy to the module-level lists li / li1 / li2.
    """
    m.fit(Xtrain, ytrain)
    predictions = m.predict(Xtest)
    accuracy = metrics.accuracy_score(ytest, predictions)
    print('Accuracy:', accuracy)
    li2.append(accuracy)
    print(classification_report(ytest, predictions))
    cm = metrics.confusion_matrix(ytest, predictions)
    print(cm)
    probabilities = m.predict_proba(Xtest)
    loss = cross_entropy(ytest, probabilities)
    print("Loss:", loss)
    auc_value = multiclass_roc_auc_score(ytest, predictions)
    print("\nroc_auc_score:", auc_value)
    print(" ")
    li.append(loss)
    li1.append(auc_value)
    print("=======================================")
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, annot=True)
    plt.show()
    print(m)
def fit():
    """Evaluate every candidate model and summarise the collected metrics.

    Runs model_built on each classifier, builds a DataFrame of loss / AUC /
    accuracy per model, prints the best model for each metric, clears the
    accumulator lists so the function can be run again, and returns the
    summary DataFrame.
    """
    candidates = [model, treemodel, knnobj, gnbobj, rfmodel]
    for candidate in candidates:
        print(" ")
        model_built(candidate)
        print("==========================================")
    dff = pd.DataFrame({"Cross Entropy Loss": li,
                        "Roc_Auc": li1,
                        "Accuracy": li2},
                       index=['model', 'treemodel', 'knnobj', 'gnbobj', 'rfmodel'])
    print(" ")
    print("The Entropy Loss of the best model is :", dff["Cross Entropy Loss"].min(), dff[["Cross Entropy Loss"]].idxmin())
    print("The max auc score:", dff["Roc_Auc"].max(), dff[["Roc_Auc"]].idxmax())
    print("The max accuracy :", dff["Accuracy"].max(), dff[["Accuracy"]].idxmax())
    # reset accumulators for any subsequent run
    li.clear()
    li1.clear()
    li2.clear()
    return dff
fit()